# Core analysis and visualization libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
# NOTE(review): suppressing ALL warnings can hide real problems (deprecations,
# dtype issues); consider filtering specific categories instead.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
# First, we import the dataset.
# NOTE(review): hard-coded absolute Windows path — prefer a relative path or config.
df = pd.read_csv(r'E:/assignment/Healthcare analytics/Diabetes.csv')
# Step 1: Understanding the Dataset
# Checking the shape of the dataset
data_shape = df.shape
# Understanding the data types of each column
data_types = df.dtypes
# Display the (rows, columns) shape in the notebook.
data_shape
(253680, 22)
# Display the inferred dtypes (all columns load as float64 from the CSV).
data_types
Diabetes_binary float64 HighBP float64 HighChol float64 CholCheck float64 BMI float64 Smoker float64 Stroke float64 HeartDiseaseorAttack float64 PhysActivity float64 Fruits float64 Veggies float64 HvyAlcoholConsump float64 AnyHealthcare float64 NoDocbcCost float64 GenHlth float64 MentHlth float64 PhysHlth float64 DiffWalk float64 Sex float64 Age float64 Education float64 Income float64 dtype: object
# Convert float columns to int in df.
# All columns hold binary flags, ordinal codes, or whole-number counts
# (see the dtypes/unique-value checks), so the float64 CSV representation
# can be downcast to integers without losing information.
df = df.astype(int)
data_types = df.dtypes
# Display the converted dtypes.
data_types
Diabetes_binary int32 HighBP int32 HighChol int32 CholCheck int32 BMI int32 Smoker int32 Stroke int32 HeartDiseaseorAttack int32 PhysActivity int32 Fruits int32 Veggies int32 HvyAlcoholConsump int32 AnyHealthcare int32 NoDocbcCost int32 GenHlth int32 MentHlth int32 PhysHlth int32 DiffWalk int32 Sex int32 Age int32 Education int32 Income int32 dtype: object
from tabulate import tabulate
# Descriptive statistics
desc_stats = df.describe()
# Convert the descriptive stats dataframe to a grid-formatted text table.
# NOTE(review): `table` is built but never printed or displayed — either
# print(table) or drop the tabulate step entirely.
table = tabulate(desc_stats, headers='keys', tablefmt='grid', showindex=True, numalign="right")
import pandas as pd
# Set the option to display tables with better formatting in Jupyter
pd.set_option('display.html.table_schema', True)
# Display the descriptive statistics
desc_stats
| Diabetes_binary | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | HeartDiseaseorAttack | PhysActivity | Fruits | Veggies | HvyAlcoholConsump | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 |
| mean | 0.139333 | 0.429001 | 0.424121 | 0.962670 | 28.382364 | 0.443169 | 0.040571 | 0.094186 | 0.756544 | 0.634256 | 0.811420 | 0.056197 | 0.951053 | 0.084177 | 2.511392 | 3.184772 | 4.242081 | 0.168224 | 0.440342 | 8.032119 | 5.050434 | 6.053875 |
| std | 0.346294 | 0.494934 | 0.494210 | 0.189571 | 6.608694 | 0.496761 | 0.197294 | 0.292087 | 0.429169 | 0.481639 | 0.391175 | 0.230302 | 0.215759 | 0.277654 | 1.068477 | 7.412847 | 8.717951 | 0.374066 | 0.496429 | 3.054220 | 0.985774 | 2.071148 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 24.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 4.000000 | 5.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 5.000000 | 7.000000 |
| 75% | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 31.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 2.000000 | 3.000000 | 0.000000 | 1.000000 | 10.000000 | 6.000000 | 8.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 98.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 5.000000 | 30.000000 | 30.000000 | 1.000000 | 1.000000 | 13.000000 | 6.000000 | 8.000000 |
# Check for unique values in each column
# (binary flags show 2; ordinal scales show their category counts).
unique_values = df.nunique()
unique_values
Diabetes_binary 2 HighBP 2 HighChol 2 CholCheck 2 BMI 84 Smoker 2 Stroke 2 HeartDiseaseorAttack 2 PhysActivity 2 Fruits 2 Veggies 2 HvyAlcoholConsump 2 AnyHealthcare 2 NoDocbcCost 2 GenHlth 5 MentHlth 31 PhysHlth 31 DiffWalk 2 Sex 2 Age 13 Education 6 Income 8 dtype: int64
# Check for missing values in each column
missing_values = df.isnull().sum()
# All counts come back zero — no imputation is needed.
missing_values
Diabetes_binary 0 HighBP 0 HighChol 0 CholCheck 0 BMI 0 Smoker 0 Stroke 0 HeartDiseaseorAttack 0 PhysActivity 0 Fruits 0 Veggies 0 HvyAlcoholConsump 0 AnyHealthcare 0 NoDocbcCost 0 GenHlth 0 MentHlth 0 PhysHlth 0 DiffWalk 0 Sex 0 Age 0 Education 0 Income 0 dtype: int64
# Count duplicate rows (rows identical across every column).
dup_mask = df.duplicated()
duplicate_rows = df[dup_mask]
number_of_duplicates = int(dup_mask.sum())
# Show the duplicate count.
number_of_duplicates
24206
# Drop duplicate rows from the dataset, keeping the first occurrence.
df = df.drop_duplicates()
# Confirm the new shape of the dataset
# (row count drops from 253,680 to 229,474 after de-duplication).
df
| Diabetes_binary | HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | HeartDiseaseorAttack | PhysActivity | Fruits | Veggies | HvyAlcoholConsump | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 1 | 1 | 40 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 5 | 18 | 15 | 1 | 0 | 9 | 4 | 3 |
| 1 | 0 | 0 | 0 | 0 | 25 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 7 | 6 | 1 |
| 2 | 0 | 1 | 1 | 1 | 28 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 5 | 30 | 30 | 1 | 0 | 9 | 4 | 8 |
| 3 | 0 | 1 | 0 | 1 | 27 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 11 | 3 | 6 |
| 4 | 0 | 1 | 1 | 1 | 24 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 2 | 3 | 0 | 0 | 0 | 11 | 5 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 253675 | 0 | 1 | 1 | 1 | 45 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 3 | 0 | 5 | 0 | 1 | 5 | 6 | 7 |
| 253676 | 1 | 1 | 1 | 1 | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 4 | 0 | 0 | 1 | 0 | 11 | 2 | 4 |
| 253677 | 0 | 0 | 0 | 1 | 28 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 5 | 2 |
| 253678 | 0 | 1 | 0 | 1 | 23 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 3 | 0 | 0 | 0 | 1 | 7 | 5 | 1 |
| 253679 | 1 | 1 | 1 | 1 | 25 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 2 | 0 | 0 | 0 | 0 | 9 | 6 | 2 |
229474 rows × 22 columns
# Correlogram: heatmap of pairwise Pearson correlations between all features.
correlation_table = df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(
    correlation_table,
    cmap="RdYlGn",   # red = negative, green = positive
    annot=True,      # print each coefficient in its cell
    fmt=".2f",
)
plt.title("Correlogram")
plt.show()
#pip install bokeh
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral11
from bokeh.transform import factor_cmap
# Enable the inline display of Bokeh's plot in Jupyter notebook
output_notebook()
# Data for the plot: class counts of the target (0 = no diabetes, 1 = diabetes).
counts = df['Diabetes_binary'].value_counts()
categories = [str(x) for x in counts.index]
values = counts.values
# Create the figure
# NOTE(review): `plot_height`/`plot_width` were renamed to `height`/`width`
# in Bokeh 3.x — this code targets Bokeh 2.x.
p1 = figure(x_range=categories, plot_height=350, title="Distribution of Diabetes_binary")
p1.vbar(x=categories, top=values, width=0.5, color="blue")
# Customize the plot
p1.xgrid.grid_line_color = None
p1.y_range.start = 0
p1.xaxis.axis_label = "Diabetes_binary"
p1.yaxis.axis_label = "Count"
# Display the plot in Jupyter Notebook
show(p1)
The dataset has a significant imbalance with a majority of observations categorized as "0" (No Diabetes). The number of observations labeled as "1" (Has Diabetes) is considerably smaller in comparison. Addressing this imbalance might be necessary during the modeling phase to ensure the model doesn't get biased towards the majority class.
# Histogram of BMI rendered with Bokeh quad glyphs (one quad per bin).
bmi_counts, bmi_edges = np.histogram(df['BMI'], bins=30)
# Build the figure and draw the bars from the bin edges.
p2 = figure(plot_height=350, title="Distribution of BMI")
p2.quad(top=bmi_counts, bottom=0, left=bmi_edges[:-1], right=bmi_edges[1:],
        fill_color="green", line_color="white", alpha=0.6)
# Axis labels.
p2.xaxis.axis_label = "BMI"
p2.yaxis.axis_label = "Count"
# Render inline in the notebook.
show(p2)
The BMI distribution appears to be somewhat right-skewed, with most individuals having a BMI in the range of 20-30. There's a noticeable peak around the range of 25-30, suggesting many individuals are in the "Overweight" category. Fewer individuals have extremely low or high BMI values, indicating outliers or rare occurrences.
# Bokeh code for the distribution of the Age feature using a simple histogram
# Data for the plot. NOTE: Age is a 13-level ordinal code (values 1-13 per
# the describe/unique-value checks above), not raw years.
hist_age, edges_age = np.histogram(df['Age'], bins=30)
# Create the figure
p3 = figure(plot_height=350, title="Distribution of Age")
p3.quad(top=hist_age, bottom=0, left=edges_age[:-1], right=edges_age[1:], fill_color="orange", line_color="white", alpha=0.6)
# Customize the plot
p3.xaxis.axis_label = "Age"
p3.yaxis.axis_label = "Count"
# Display the plot in Jupyter Notebook
show(p3)
The age distribution is relatively uniform across different age groups, with some minor variations. No single age group dominates the dataset, ensuring a diverse representation of ages. The age values appear to be encoded or categorized, as there are distinct peaks, suggesting specific age ranges or groups.
import matplotlib.pyplot as plt
# Donut chart of the target distribution.
labels = ['No Diabetes', 'Diabetes']
sizes = df['Diabetes_binary'].value_counts().values
colors = ['blue', 'red']
# Pull the first (majority) wedge out slightly for emphasis.
explode = (0.1, 0)
plt.figure(figsize=(8, 6))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops=dict(width=0.3),  # width < 1 hollows the pie into a donut
)
plt.title('Distribution of Individuals Based on Diabetes Status')
plt.show()
A significant proportion of the individuals in the dataset do not have diabetes. The portion representing individuals with diabetes is noticeably smaller, but it's still a considerable fraction. The exploded slice emphasizes the number of individuals without diabetes, making it easier to differentiate and focus on that segment.
import seaborn as sns
import matplotlib.pyplot as plt
# Generate pairwise plots (scatter matrix with marginal distributions on the
# diagonal) for the target and the two highest-cardinality features.
sns.pairplot(df[["Diabetes_binary", "Age", "BMI"]])
# Display the plot
plt.show()
The diagonal plots provide the distribution of each individual variable. Scatter plots off the diagonals show relationships between pairs of variables. No strong linear correlation is evident between the pairs of variables, but the plots help to visually detect any patterns or clusters.
pip install plotly
Defaulting to user installation because normal site-packages is not writeable Requirement already satisfied: plotly in c:\programdata\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\programdata\anaconda3\lib\site-packages (from plotly) (8.0.1) Note: you may need to restart the kernel to use updated packages.
# Bar chart of each feature's Pearson correlation with the target.
# Compute the correlation of each feature with the target variable
# (drop the target's self-correlation of 1.0).
correlations = df.corr()['Diabetes_binary'].drop('Diabetes_binary')
# Convert the series to a DataFrame
correlations_with_target = pd.DataFrame(correlations).reset_index()
correlations_with_target.columns = ['Feature', 'Correlation']
# Green bars for positive correlations, red for negative.
colors = ["green" if x >= 0 else "red" for x in correlations_with_target["Correlation"]]
# Create the figure
p6 = figure(x_range=correlations_with_target["Feature"].tolist(),
plot_height=400, plot_width=800,
title="Correlation with Diabetes_binary")
p6.vbar(x=correlations_with_target["Feature"].tolist(),
top=correlations_with_target["Correlation"].tolist(),
width=0.5,
color=colors)
# Customize the plot
p6.xgrid.grid_line_color = None
# Rotate x tick labels (angle in radians) so feature names do not overlap.
p6.xaxis.major_label_orientation = 1.2
# Display the plot in Jupyter Notebook
show(p6)
High Positive Correlation:
Features such as GenHlth, HighBP, BMI, DiffWalk, and HighChol show the strongest positive correlations with Diabetes_binary, suggesting that higher values of these features are associated with a higher likelihood of having diabetes.
High Negative Correlation:
Income, Education, and PhysActivity show negative correlations with Diabetes_binary, indicating that higher values of these features are associated with a lower likelihood of having diabetes. These results should still be interpreted with caution, since correlation does not imply causation.
Near Zero Correlation:
Features such as AnyHealthcare, NoDocbcCost, and Fruits have correlations close to zero with Diabetes_binary, suggesting that, on their own, they do not have a strong linear relationship with the target variable in this dataset.
# Create the scatter plot using Bokeh
# Scatter plot: days of poor physical health vs days of poor mental health.
# Both PhysHlth and MentHlth are 0-30 "days in the past month" counts.
p7 = figure(plot_height=400, plot_width=600, title="Relationship between Physical Health and Mental Health")
p7.scatter(x=df['PhysHlth'], y=df['MentHlth'], size=5, color="blue", alpha=0.6)
# Customize the plot
p7.xaxis.axis_label = "Physical Health"
p7.yaxis.axis_label = "Mental Health"
# Display the plot in Jupyter Notebook
show(p7)
There's a spread of data points across the graph, indicating varied responses regarding physical and mental health. Some clustering can be observed along the axes, suggesting there are common durations when individuals reported no or minimal health issues. There are no apparent strong linear patterns in the scatter plot, indicating that days of physical health issues aren't directly proportional to days of mental health issues for all individuals
# Set the figure size
plt.figure(figsize=(15, 10))
# Use seaborn's boxplot function to plot all columns on one shared axes.
sns.boxplot(data=df)
# Set the title and display the plot
plt.title("Box plots for all columns")
# Rotate the tick labels so all 22 column names remain legible.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Distribution Spread: BMI, MentHlth, and PhysHlth show the widest spread of values, indicated by the size of their boxes (IQR); most other columns are binary flags and therefore show almost no spread.
Outliers: BMI, MentHlth, and PhysHlth have noticeable outliers. Outliers might indicate rare events, errors in data collection, or genuine extreme values.
Skewness: MentHlth and PhysHlth are heavily right-skewed — the median sits at zero while a minority of respondents report many days of poor health.
Central Tendency: The Age column (an ordinal 1-13 code) has its median near the middle of the coding range.
Potential Data Issues: The BMI column reaches values as high as 98, which may reflect data-entry errors and is worth validating before modeling.
# 2x2 grid of violin plots: Age and BMI distributions split by Sex and by
# diabetes status.
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
# (subplot position, x column, y column, panel title)
panels = [
    ((0, 0), 'Sex', 'Age', "Age Distribution by Sex"),
    ((0, 1), 'Diabetes_binary', 'Age', "Age Distribution by Diabetes_binary"),
    ((1, 0), 'Diabetes_binary', 'BMI', "BMI Distribution by Diabetes_binary"),
    ((1, 1), 'Sex', 'BMI', "BMI Distribution by Sex"),
]
for (row, col), x_col, y_col, title in panels:
    ax = axs[row, col]
    sns.violinplot(x=df[x_col], y=df[y_col], ax=ax, palette="muted")
    ax.set_title(title)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
# Adjust layout
plt.tight_layout()
plt.show()
The combined violin plots reveal age and BMI distributions across gender and diabetes status. Both genders exhibit similar age profiles, while individuals with diabetes tend to have a higher concentration of elevated BMI values. The age distribution for those with and without diabetes is comparable, and both genders show a broad BMI range with a central concentration.
import plotly.express as px
# Create a treemap using Plotly: tiles nested first by BMI value, then by
# diabetes status within each BMI value.
fig = px.treemap(df, path=['BMI','Diabetes_binary'], title="Distribution of BMI with Diabetes")
# Display the treemap in the Jupyter Notebook
fig.show()
# Custom color palette where 0 (no diabetes) is blue and 1 (diabetes) is red.
custom_palette = {0: 'blue', 1: 'red'}
# 3x2 grid: five bar/count charts comparing groups by gender / HighBP and
# diabetes status; the sixth slot is unused.
fig, axs = plt.subplots(3, 2, figsize=(15, 15))
# 1. Bar Chart 1: Average Age by Gender and Diabetes Status
sns.barplot(x="Sex", y="Age", hue="Diabetes_binary", data=df, ax=axs[0, 0], palette=custom_palette)
axs[0, 0].set_title("1. Average Age by Gender and Diabetes Status")
axs[0, 0].set_xticklabels(['Female', 'Male'])
# 2. Bar Chart 2: Average BMI by Gender and Diabetes Status
sns.barplot(x="Sex", y="BMI", hue="Diabetes_binary", data=df, ax=axs[0, 1], palette=custom_palette)
axs[0, 1].set_title("2. Average BMI by Gender and Diabetes Status")
axs[0, 1].set_xticklabels(['Female', 'Male'])
# 3. Bar Chart 3: Average HighBP by Gender and Diabetes Status
sns.barplot(x="Sex", y="HighBP", hue="Diabetes_binary", data=df, ax=axs[1, 0], palette=custom_palette)
axs[1, 0].set_title("3. Average HighBP by Gender and Diabetes Status")
axs[1, 0].set_xticklabels(['Female', 'Male'])
# 4. Bar Chart 4: Count of individuals by Gender and Diabetes Status
sns.countplot(x="Sex", hue="Diabetes_binary", data=df, ax=axs[1, 1], palette=custom_palette)
axs[1, 1].set_title("4. Count of Individuals by Gender and Diabetes Status")
axs[1, 1].set_xticklabels(['Female', 'Male'])
# 5. Bar Chart 5: Count of individuals by HighBP and Diabetes Status
sns.countplot(x="HighBP", hue="Diabetes_binary", data=df, ax=axs[2, 0], palette=custom_palette)
axs[2, 0].set_title("5. Count of Individuals by HighBP and Diabetes Status")
# Fix: hide the unused sixth subplot — the original left an empty axes frame
# in the bottom-right corner of the figure.
axs[2, 1].axis('off')
# Adjust layout
plt.tight_layout()
plt.show()
Observations:
- Average Age by Gender and Diabetes Status: for both genders, individuals with diabetes tend to be older on average.
- Average BMI by Gender and Diabetes Status: for both genders, individuals with diabetes show a higher average BMI.
- Average HighBP by Gender and Diabetes Status: individuals with diabetes have a higher average HighBP value than those without diabetes.
- Count of Individuals by Gender and Diabetes Status: non-diabetic individuals outnumber diabetic individuals for both genders.
- Count of Individuals by HighBP and Diabetes Status: individuals without diabetes more often report low HighBP values, while those with diabetes more often report high HighBP values.
plt.figure(figsize=(10, 6))
# Box plot of BMI split by gender (x-axis), colored by diabetes status (hue).
sns.boxplot(x='Sex', y='BMI', hue='Diabetes_binary', data=df, palette={0: 'blue', 1: 'red'})
plt.title("Box Plot: Distribution of BMI by Gender and Diabetes Status")
# Sex is coded 0/1; relabel the ticks with readable names.
plt.xticks(ticks=[0, 1], labels=['Female', 'Male'])
plt.show()
A box plot can show if the median BMI for individuals with diabetes is higher than for those without diabetes across genders. The interquartile range can provide insights into the variability of BMI values for the two groups. Any outliers in the BMI distribution for either group can be easily identified using this plot.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Overlaid density (KDE) plots of BMI by sex: magma palette for the
# non-diabetic subset, viridis palette for the diabetic subset.
plt.figure(figsize=(12, 7))
sns.kdeplot(data=df[df['Diabetes_binary'] == 0], x='BMI', hue='Sex', fill=True, common_norm=False, palette="magma", alpha=.5, linewidth=0.5)
sns.kdeplot(data=df[df['Diabetes_binary'] == 1], x='BMI', hue='Sex', fill=True, common_norm=False, palette="viridis", alpha=.5, linewidth=0.5)
plt.title("Density Plot: Distribution of BMI by Gender and Diabetes Status")
plt.show()
General Trend: The BMI distribution for both males and females tends to peak around the same value, irrespective of their diabetes status. Females without Diabetes: The distribution appears slightly more spread out compared to females with diabetes. Males with Diabetes: The distribution for males with diabetes has a noticeable peak, suggesting a specific BMI range is more common among diabetic males.
# 2x2 grid of Age histograms (with KDE curves) for the four
# diabetes-status x sex subgroups.
fig, axs = plt.subplots(2, 2, figsize=(15, 12))
# (subplot position, Diabetes_binary value, Sex value, bar color, title)
subgroups = [
    ((0, 0), 0, 0, 'purple', "Females without Diabetes: Age Distribution"),
    ((0, 1), 1, 0, 'purple', "Females with Diabetes: Age Distribution"),
    ((1, 0), 0, 1, 'orange', "Males without Diabetes: Age Distribution"),
    ((1, 1), 1, 1, 'orange', "Males with Diabetes: Age Distribution"),
]
for (row, col), diabetes, sex, bar_color, title in subgroups:
    subset = df[(df['Diabetes_binary'] == diabetes) & (df['Sex'] == sex)]
    sns.histplot(data=subset, x='Age', color=bar_color, bins=30, ax=axs[row, col], kde=True)
    axs[row, col].set_title(title)
plt.tight_layout()
plt.show()
Females without Diabetes: The age distribution for females without diabetes peaks around the middle-aged bracket, with a broader spread indicating diversity in ages. Females with Diabetes: The age distribution has a similar peak as females without diabetes but is slightly more concentrated around the middle ages. Males without Diabetes: The age distribution for males without diabetes is notably peaked, suggesting a concentration of non-diabetic males in a specific age range. Males with Diabetes: The distribution seems broader, showing that diabetic males are distributed across various age groups, but there's still a noticeable peak in the middle age range.
# Identify categorical variables (fewer than 10 distinct values) for the grid.
categorical_cols = df.nunique()[df.nunique() < 10].index.tolist()
# Removing 'Sex' and 'Diabetes_binary' as they've been addressed earlier
categorical_cols = [col for col in categorical_cols if col not in ['Sex', 'Diabetes_binary']]
# Picking the first 8 suitable categorical columns for visualization
categorical_cols_for_viz = categorical_cols[:8]
# 2x4 grid of count plots, each overlaid with the share of diabetic cases.
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
# Fix: set the suptitle AFTER creating the figure. The original called
# fig.suptitle() before plt.subplots(), so the title was attached to the
# previously active figure and never appeared on this one.
fig.suptitle("Diabetic Characteristics by Past Medical Conditions", fontsize=14)
for i, var in enumerate(categorical_cols_for_viz):
    ax = axes[i//4][i%4]
    sns.countplot(x=var, hue="Diabetes_binary", data=df, ax=ax, palette={0: 'blue', 1: 'red'})
    ax.set_xlabel(var, fontsize=10)
    ax.set_ylabel("Count")
    ax.set_title("Count of Diabetes cases by "+var, fontdict={"fontsize": 10})
    # Overlay the per-category mean of Diabetes_binary (= fraction diabetic)
    # on a secondary y-axis fixed to [0, 1].
    ax2 = ax.twinx()
    ax2 = sns.pointplot(x=var, y="Diabetes_binary", data=df.groupby(var).mean().reset_index(), color='black', markers='x', scale=0.6, ax=ax2)
    ax2.set_ylim([0, 1])
    ax2.set_ylabel("% Diabetic", color='black')
plt.tight_layout()
plt.show()
# Stacked-bar colors: red for "Yes" (1), green for "No" (0).
colors_simplified = ['#E63946','#50C878']

def simplified_analyse_cat(var):
    """Plot the Yes/No answer shares for `var`, one horizontal stacked bar
    per diabetes group (non-diabetic vs diabetic)."""
    proportions = (
        df.groupby('Diabetes_binary')[var]
        .value_counts(1)    # normalized within each diabetes group
        .unstack()
        .iloc[:, ::-1]      # reverse columns so "Yes" (1) is drawn first
    )
    ax = proportions.plot(kind='barh', stacked=True, figsize=(10, 2),
                          color=colors_simplified, alpha=0.8)
    ax.spines[['top', 'right']].set_visible(False)
    plt.legend(['Yes', "No"], bbox_to_anchor=(1, 1, 0, 0), shadow=False, frameon=False)
    plt.yticks(ticks=[0,1], labels=['Non-Diabetic', 'Diabetic'])
    plt.tight_layout()
    plt.title(var, fontsize=18)
    plt.show()
# Identifying boolean variables (exactly 2 distinct values) for the visualization
bool_vars = df.nunique()[df.nunique() == 2].index.tolist()
bool_vars = [var for var in bool_vars if var not in ['Sex', 'Diabetes_binary']]
# Visualizing all the variables in bool_vars using the function
for var in bool_vars:
    simplified_analyse_cat(var)
# Check the distribution of the target variable (counts per class).
diabetes_distribution = df['Diabetes_binary'].value_counts()
diabetes_distribution
0 194377 1 35097 Name: Diabetes_binary, dtype: int64
Observation: It's essential to ensure that the target classes are almost equally represented. If not, models might show a bias towards the majority class. Method: A simple value count on the target variable, Diabetes_binary, can reveal the distribution of the two classes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
# Splitting data into features and target variable
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']
# Splitting the data into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)
# Feature selection based on ANOVA F-value. k='all' keeps every feature —
# we only want the p-values for inspection, not an actual selection.
selector = SelectKBest(score_func=f_classif, k='all')
selector.fit(X_train, y_train)
# Getting the p-values of each feature
pvalues = selector.pvalues_
# Creating a DataFrame to display p-values
pvalue_df = pd.DataFrame({
    'Feature': X_train.columns,
    'P-Value': pvalues
})
# Sorting the features based on p-value (lower p-value indicates more significant feature)
sorted_pvalue_df = pvalue_df.sort_values(by='P-Value').reset_index(drop=True)
# Displaying the sorted p-value dataframe
sorted_pvalue_df
| Feature | P-Value | |
|---|---|---|
| 0 | HighBP | 0.000000e+00 |
| 1 | Age | 0.000000e+00 |
| 2 | DiffWalk | 0.000000e+00 |
| 3 | PhysHlth | 0.000000e+00 |
| 4 | GenHlth | 0.000000e+00 |
| 5 | Education | 0.000000e+00 |
| 6 | PhysActivity | 0.000000e+00 |
| 7 | Income | 0.000000e+00 |
| 8 | Stroke | 0.000000e+00 |
| 9 | BMI | 0.000000e+00 |
| 10 | HighChol | 0.000000e+00 |
| 11 | HeartDiseaseorAttack | 0.000000e+00 |
| 12 | CholCheck | 1.555051e-212 |
| 13 | HvyAlcoholConsump | 1.017254e-176 |
| 14 | MentHlth | 7.099456e-118 |
| 15 | Smoker | 1.925093e-85 |
| 16 | Veggies | 2.518764e-69 |
| 17 | Sex | 6.960530e-45 |
| 18 | AnyHealthcare | 7.686685e-28 |
| 19 | Fruits | 6.672511e-27 |
| 20 | NoDocbcCost | 2.156532e-16 |
Checking for Statistical Significance: Statistical tests revealed the significance of various health-related features in relation to diabetes. Features like HighBP, Age, and BMI exhibited extremely low p-values, indicating strong statistical significance. Even features with higher p-values, such as NoDocbcCost and Fruits, remained significant, underscoring their potential influence on the diabetes outcome. From the results, all the variables are statistically significant; hence, we are not dropping any variables.
# Over-sampling of the dataset to get a balanced target distribution.
class_0 = df[df['Diabetes_binary'] == 0]
class_1 = df[df['Diabetes_binary'] == 1]
# Over-sample the minority class (1) with replacement until it matches the
# majority class size. Fix: pass random_state so the resampling — and every
# downstream model metric — is reproducible across runs.
class_1_over = class_1.sample(len(class_0), replace=True, random_state=96)
# NOTE(review): oversampling BEFORE the train/test split leaks duplicated
# minority rows into both sets and inflates test metrics; ideally resample
# only the training split after splitting.
# Creating a new dataframe with over sampled class 1 df and class 0 df
df_new = pd.concat([class_1_over, class_0], axis=0)
# Plotting the now-balanced label distribution.
df_new['Diabetes_binary'].value_counts().plot(kind='bar', title='Label Distribution after Oversampling')
<Axes: title={'center': 'Label Distribution after Oversampling'}>
The dataset was found to have an imbalance in the distribution of the target variable, Diabetes_binary. To address this, oversampling was employed on the minority class (class 1). The oversampling was done to match the count of the majority class (class 0). A new dataframe, df_new, was created by merging the oversampled class 1 data with the original class 0 data. The resulting balanced dataset was then visualized, showcasing an equal distribution of both classes, as confirmed by the bar plot representing the label distribution post-oversampling.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
X = df_new.drop('Diabetes_binary', axis = 1) # features
# Fix: select the target with single brackets so it is a 1-D Series. The
# original double-bracket selection produced an (n, 1) DataFrame, which
# triggers sklearn's DataConversionWarning and an implicit ravel on fit.
y = df_new['Diabetes_binary'] # labels
# Splitting the data into training and testing sets (75/25, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=96)
model = RandomForestClassifier(random_state=96)
# Training the model using the resampled (balanced) data
model.fit(X_train, y_train)
RandomForestClassifier(random_state=96)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(random_state=96)
A Random Forest classifier was chosen to model the relationship between various health metrics and the diabetes outcome. Initially, the dataset was split into features (X) and labels (y) with Diabetes_binary being the target variable. Subsequently, the data was divided into training and testing sets, allocating 25% of the data for testing. Using the balanced dataset, the Random Forest model was trained on the training data. The use of a consistent random_state ensures reproducibility in both data splitting and model training.
from tqdm.notebook import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Candidate models to benchmark: (display name, unfitted estimator).
# NOTE(review): the estimators here use default seeds — pass random_state
# for fully reproducible comparisons.
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier()),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("Extra Trees", ExtraTreesClassifier()),
    ("Bagging", BaggingClassifier()),
    ("XGBoost", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
]
# Dictionary to store performance metrics, keyed by model name.
performance = {}
# Fix: flatten the target to 1-D once up front — fitting with a
# single-column DataFrame raises DataConversionWarning in several estimators.
y_train_flat = np.ravel(y_train)
# Loop through each model using tqdm for progress tracking
for name, model in tqdm(models, desc="Training Models"):
    # Train the model
    model.fit(X_train, y_train_flat)
    # Predictions on the test set
    y_pred = model.predict(X_test)
    # Probability of the positive class, needed for ROC-AUC.
    y_prob = model.predict_proba(X_test)[:, 1]
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    # Store metrics
    performance[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC": roc_auc
    }
# Display the performance metrics for each model (one row per model).
performance_df = pd.DataFrame(performance).T
display(performance_df)
Training Models: 0%| | 0/9 [00:00<?, ?it/s]
| Accuracy | Precision | Recall | F1 Score | ROC-AUC | |
|---|---|---|---|---|---|
| Logistic Regression | 0.733005 | 0.723050 | 0.752935 | 0.737690 | 0.808834 |
| Decision Tree | 0.911677 | 0.856331 | 0.988754 | 0.917790 | 0.914252 |
| Random Forest | 0.941784 | 0.902549 | 0.990157 | 0.944325 | 0.991872 |
| Gradient Boosting | 0.740392 | 0.722382 | 0.778564 | 0.749421 | 0.818587 |
| K-Nearest Neighbors | 0.811697 | 0.748599 | 0.937042 | 0.832287 | 0.892070 |
| AdaBoost | 0.736791 | 0.724367 | 0.762139 | 0.742773 | 0.813643 |
| Extra Trees | 0.958884 | 0.933983 | 0.987330 | 0.959916 | 0.992158 |
| Bagging | 0.934015 | 0.893546 | 0.985019 | 0.937055 | 0.985818 |
| XGBoost | 0.757709 | 0.733500 | 0.807453 | 0.768702 | 0.836997 |
import matplotlib.pyplot as plt
import seaborn as sns
# Bar charts of the four headline metrics across all trained models.
metrics = ["Accuracy", "Precision", "Recall", "F1 Score"]
model_names = list(performance.keys())
data = {metric: [performance[model][metric] for model in model_names] for metric in metrics}
# One subplot per metric in a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
for ax, metric in zip(axes.ravel(), metrics):
    sns.barplot(x=model_names, y=data[metric], ax=ax)
    ax.set_title(metric)
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim([0, 1])
    # Print each bar's value just above its top edge.
    for bar in ax.patches:
        ax.annotate(f"{bar.get_height():.4f}",
                    (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center', va='center', xytext=(0, 10),
                    textcoords='offset points')
plt.tight_layout()
plt.show()
The graph showcases the performance metrics of various machine learning models on predicting diabetes outcomes.
Random Forest and Extra Trees are the standout performers with the highest accuracy of approximately 94.2% and 95.9% respectively. They also excel in other metrics, emphasizing their robustness and precision in predictions.
Decision Tree also demonstrates strong results with an accuracy of 91.2%, and its recall suggests it is particularly good at identifying positive diabetes cases.
Gradient Boosting, Logistic Regression, and AdaBoost hover around the same accuracy range of approximately 73-74%, indicating they might require further tuning or feature engineering for enhanced performance.
XGBoost provides an accuracy of about 75.8%, making it a middle-tier performer in this set.
K-Nearest Neighbors (KNN) and Bagging show good results with accuracies above 80%, with KNN excelling in recall, suggesting it identifies most positive cases correctly.
The ROC-AUC scores (an indicator of a model's ability to distinguish between classes) for most models are quite high, especially for Random Forest, Extra Trees, and Bagging, indicating their strong discriminative power.
In summary, while models like Random Forest and Extra Trees outshine the others in this dataset, each model has its strengths and could be further optimized based on specific use cases or objectives.
from sklearn.metrics import roc_curve, roc_auc_score, auc
# Function to plot the ROC curve
def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve, plus the chance diagonal, on the current axes."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Chance-level reference line (AUC = 0.5).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
# Overlay the ROC curve of every already-fitted model on a single figure.
plt.figure(figsize=(10, 8))
for name, fitted in models:
    # Positive-class scores drive the ROC computation.
    probs = fitted.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    plot_roc_curve(fpr, tpr, label=f"{name} (AUC = {roc_auc_score(y_test, probs):.2f})")
plt.show()
The tree-based ensemble models outperform the logistic regression and K-nearest neighbors models on this task.
Here is a more detailed interpretation of the ROC curve for each model:
Random forest (AUC = 0.99): This model has among the highest AUCs, indicating that it is very good at distinguishing between positive and negative cases. Extra trees (AUC = 0.99): This model is very similar to the random forest model, and it also has a very high AUC. Bagging (AUC = 0.99): Another tree-based ensemble that performs very well on this task. Decision tree (AUC = 0.91) and K-nearest neighbors (AUC = 0.89) also discriminate well, with KNN's strength being recall. XGBoost (AUC = 0.84) and gradient boosting (AUC = 0.82) perform moderately, noticeably below the bagged tree ensembles. Logistic regression (AUC = 0.81) and AdaBoost (AUC = 0.81) have the lowest AUCs of the models shown. It is important to note that the performance of a machine learning model can vary depending on the specific dataset that it is trained on. Therefore, it is important to evaluate different models on a held-out test set in order to get a better estimate of their performance in the real world.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
# Hyperparameter search space for the Random Forest.
# NOTE: 'auto' was removed from max_features in scikit-learn 1.3 (for
# classifiers it was just an alias of 'sqrt'), so only supported values
# are listed here.
param_dist = {
    'n_estimators': [10, 20, 30, 50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
# Base estimator; random_state pins the forest's internal randomness.
rf = RandomForestClassifier(random_state=42)
# Sample 50 candidate combinations, each scored with 3-fold cross-validation.
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=50,
                                   cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)
# Keep both the winning hyperparameters and the refitted best model.
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_
print("Best Hyperparameters:", best_params)
Fitting 3 folds for each of 50 candidates, totalling 150 fits Best Hyperparameters: {'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 30, 'bootstrap': False}
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Re-instantiate the Random Forest with the hyperparameters reported by
# RandomizedSearchCV. Two fixes versus the hand-copied version:
#  - max_depth=30 matches the searched optimum (the copy used 40);
#  - max_features='sqrt' replaces 'auto', which was removed in
#    scikit-learn 1.3 (for classifiers 'auto' meant 'sqrt').
# NOTE(review): random_state here (24) and in the split below (29) differ
# from the search's seed (42) — presumably intentional, but confirm.
best_rf = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=30,
    bootstrap=False,
    random_state=24
)
# Fresh train/test split for the final evaluation.
X_train_imp, X_test_imp, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)
# Fit the tuned model and score the held-out split.
best_rf.fit(X_train_imp, y_train)
y_pred_best = best_rf.predict(X_test_imp)
# Standard binary-classification metrics on the test predictions.
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)
print(f"Accuracy: {accuracy_best:.4f}")
print(f"Precision: {precision_best:.4f}")
print(f"Recall: {recall_best:.4f}")
print(f"F1 Score: {f1_best:.4f}")
# Confusion matrix as an annotated heatmap.
conf_matrix = confusion_matrix(y_test, y_pred_best)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues',
            xticklabels=['No Diabetes', 'Diabetes'],
            yticklabels=['No Diabetes', 'Diabetes'])
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()
Accuracy: 0.9649 Precision: 0.9435 Recall: 0.9888 F1 Score: 0.9656
The trained Random Forest classifier exhibits exceptional performance on the test set. It achieves an accuracy of 96.49%, indicating that the vast majority of predictions are correct. With a precision of 94.35%, the model demonstrates high reliability in its positive predictions. The recall, standing at 98.88%, signifies that the model successfully identifies almost all actual positive cases. The F1 score, a harmonic mean of precision and recall, is at a commendable 96.56%, further underscoring the model's robustness.
# Feature importances of the tuned forest, plotted most- to least-important.
importances_best = best_rf.feature_importances_
# argsort is ascending; [::-1] flips to descending importance.
sorted_indices = np.argsort(importances_best)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Feature Importance for Tuned Random Forest Model")
plt.bar(X.columns[sorted_indices], importances_best[sorted_indices], align="center")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Side-by-side actual vs. predicted labels for spot-checking the model.
comparison_df = pd.DataFrame({
    "Actual": y_test.values.flatten(),
    "Predicted": y_pred_best
})
# FIX: IPython.core.display is a deprecated import path (removed in recent
# IPython); the public API lives in IPython.display.
from IPython.display import display, HTML
# Render the first 1000 rows as an HTML table in the notebook output.
display(HTML(comparison_df.head(1000).to_html()))
print(f"Accuracy Score: {accuracy_best:.4f}")
| Actual | Predicted | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 0 | 0 |
| 4 | 0 | 0 |
| 5 | 1 | 1 |
| 6 | 0 | 0 |
| 7 | 0 | 0 |
| 8 | 0 | 0 |
| 9 | 1 | 1 |
| 10 | 0 | 0 |
| 11 | 1 | 1 |
| 12 | 1 | 1 |
| 13 | 1 | 1 |
| 14 | 1 | 1 |
| 15 | 0 | 0 |
| 16 | 1 | 1 |
| 17 | 0 | 1 |
| 18 | 0 | 0 |
| 19 | 0 | 1 |
| 20 | 0 | 0 |
| 21 | 1 | 1 |
| 22 | 1 | 1 |
| 23 | 1 | 1 |
| 24 | 1 | 1 |
| 25 | 1 | 1 |
| 26 | 0 | 0 |
| 27 | 0 | 0 |
| 28 | 0 | 0 |
| 29 | 0 | 0 |
| 30 | 0 | 0 |
| 31 | 1 | 1 |
| 32 | 1 | 1 |
| 33 | 0 | 0 |
| 34 | 1 | 1 |
| 35 | 0 | 0 |
| 36 | 0 | 0 |
| 37 | 1 | 1 |
| 38 | 1 | 1 |
| 39 | 1 | 1 |
| 40 | 0 | 0 |
| 41 | 1 | 1 |
| 42 | 0 | 1 |
| 43 | 0 | 0 |
| 44 | 1 | 1 |
| 45 | 1 | 1 |
| 46 | 1 | 1 |
| 47 | 0 | 0 |
| 48 | 1 | 1 |
| 49 | 0 | 0 |
| 50 | 0 | 0 |
| 51 | 1 | 1 |
| 52 | 0 | 1 |
| 53 | 1 | 1 |
| 54 | 0 | 0 |
| 55 | 0 | 0 |
| 56 | 1 | 1 |
| 57 | 1 | 1 |
| 58 | 1 | 1 |
| 59 | 0 | 0 |
| 60 | 1 | 1 |
| 61 | 0 | 0 |
| 62 | 0 | 0 |
| 63 | 0 | 0 |
| 64 | 1 | 1 |
| 65 | 0 | 0 |
| 66 | 0 | 0 |
| 67 | 0 | 0 |
| 68 | 1 | 1 |
| 69 | 0 | 0 |
| 70 | 1 | 1 |
| 71 | 0 | 0 |
| 72 | 1 | 1 |
| 73 | 0 | 0 |
| 74 | 1 | 1 |
| 75 | 0 | 0 |
| 76 | 1 | 1 |
| 77 | 1 | 1 |
| 78 | 0 | 0 |
| 79 | 0 | 0 |
| 80 | 1 | 1 |
| 81 | 1 | 1 |
| 82 | 1 | 1 |
| 83 | 1 | 1 |
| 84 | 0 | 0 |
| 85 | 1 | 1 |
| 86 | 0 | 0 |
| 87 | 0 | 0 |
| 88 | 1 | 1 |
| 89 | 0 | 0 |
| 90 | 1 | 1 |
| 91 | 0 | 0 |
| 92 | 0 | 0 |
| 93 | 0 | 0 |
| 94 | 1 | 1 |
| 95 | 0 | 0 |
| 96 | 0 | 0 |
| 97 | 1 | 1 |
| 98 | 1 | 1 |
| 99 | 1 | 1 |
| 100 | 1 | 1 |
| 101 | 1 | 1 |
| 102 | 0 | 0 |
| 103 | 1 | 1 |
| 104 | 0 | 0 |
| 105 | 0 | 0 |
| 106 | 0 | 0 |
| 107 | 0 | 0 |
| 108 | 1 | 1 |
| 109 | 0 | 0 |
| 110 | 1 | 1 |
| 111 | 1 | 1 |
| 112 | 0 | 0 |
| 113 | 1 | 1 |
| 114 | 1 | 1 |
| 115 | 1 | 1 |
| 116 | 1 | 1 |
| 117 | 1 | 1 |
| 118 | 0 | 0 |
| 119 | 1 | 1 |
| 120 | 1 | 1 |
| 121 | 1 | 1 |
| 122 | 0 | 0 |
| 123 | 1 | 1 |
| 124 | 0 | 0 |
| 125 | 1 | 1 |
| 126 | 1 | 1 |
| 127 | 0 | 0 |
| 128 | 1 | 0 |
| 129 | 0 | 0 |
| 130 | 0 | 0 |
| 131 | 0 | 0 |
| 132 | 0 | 0 |
| 133 | 1 | 1 |
| 134 | 0 | 0 |
| 135 | 0 | 0 |
| 136 | 1 | 1 |
| 137 | 1 | 1 |
| 138 | 1 | 1 |
| 139 | 0 | 0 |
| 140 | 1 | 1 |
| 141 | 1 | 1 |
| 142 | 0 | 0 |
| 143 | 1 | 1 |
| 144 | 1 | 0 |
| 145 | 1 | 1 |
| 146 | 1 | 1 |
| 147 | 0 | 1 |
| 148 | 0 | 0 |
| 149 | 1 | 1 |
| 150 | 1 | 1 |
| 151 | 1 | 1 |
| 152 | 1 | 1 |
| 153 | 0 | 1 |
| 154 | 1 | 1 |
| 155 | 0 | 0 |
| 156 | 0 | 0 |
| 157 | 1 | 1 |
| 158 | 0 | 0 |
| 159 | 0 | 0 |
| 160 | 0 | 0 |
| 161 | 1 | 1 |
| 162 | 1 | 1 |
| 163 | 0 | 0 |
| 164 | 0 | 0 |
| 165 | 0 | 0 |
| 166 | 1 | 1 |
| 167 | 1 | 1 |
| 168 | 1 | 1 |
| 169 | 0 | 0 |
| 170 | 0 | 0 |
| 171 | 0 | 0 |
| 172 | 1 | 1 |
| 173 | 1 | 1 |
| 174 | 0 | 0 |
| 175 | 1 | 1 |
| 176 | 1 | 1 |
| 177 | 0 | 1 |
| 178 | 1 | 1 |
| 179 | 1 | 1 |
| 180 | 0 | 0 |
| 181 | 0 | 0 |
| 182 | 1 | 1 |
| 183 | 1 | 1 |
| 184 | 0 | 0 |
| 185 | 1 | 1 |
| 186 | 1 | 1 |
| 187 | 1 | 1 |
| 188 | 0 | 0 |
| 189 | 1 | 1 |
| 190 | 1 | 1 |
| 191 | 1 | 1 |
| 192 | 0 | 0 |
| 193 | 0 | 0 |
| 194 | 0 | 0 |
| 195 | 1 | 1 |
| 196 | 1 | 1 |
| 197 | 1 | 1 |
| 198 | 0 | 0 |
| 199 | 1 | 1 |
| 200 | 1 | 1 |
| 201 | 1 | 1 |
| 202 | 0 | 0 |
| 203 | 0 | 1 |
| 204 | 0 | 1 |
| 205 | 0 | 0 |
| 206 | 1 | 1 |
| 207 | 1 | 1 |
| 208 | 0 | 0 |
| 209 | 0 | 0 |
| 210 | 0 | 0 |
| 211 | 1 | 1 |
| 212 | 0 | 0 |
| 213 | 1 | 1 |
| 214 | 1 | 1 |
| 215 | 0 | 0 |
| 216 | 1 | 1 |
| 217 | 1 | 1 |
| 218 | 0 | 0 |
| 219 | 0 | 0 |
| 220 | 1 | 1 |
| 221 | 1 | 1 |
| 222 | 0 | 0 |
| 223 | 0 | 0 |
| 224 | 1 | 1 |
| 225 | 1 | 1 |
| 226 | 1 | 1 |
| 227 | 1 | 1 |
| 228 | 1 | 1 |
| 229 | 0 | 1 |
| 230 | 0 | 0 |
| 231 | 1 | 1 |
| 232 | 1 | 1 |
| 233 | 0 | 0 |
| 234 | 0 | 0 |
| 235 | 1 | 1 |
| 236 | 1 | 1 |
| 237 | 1 | 1 |
| 238 | 0 | 0 |
| 239 | 1 | 1 |
| 240 | 0 | 0 |
| 241 | 1 | 1 |
| 242 | 0 | 0 |
| 243 | 1 | 1 |
| 244 | 1 | 1 |
| 245 | 0 | 0 |
| 246 | 0 | 0 |
| 247 | 1 | 1 |
| 248 | 1 | 1 |
| 249 | 0 | 0 |
| 250 | 0 | 1 |
| 251 | 0 | 0 |
| 252 | 0 | 0 |
| 253 | 1 | 1 |
| 254 | 0 | 0 |
| 255 | 1 | 1 |
| 256 | 0 | 0 |
| 257 | 0 | 0 |
| 258 | 1 | 1 |
| 259 | 1 | 1 |
| 260 | 0 | 1 |
| 261 | 0 | 0 |
| 262 | 1 | 1 |
| 263 | 0 | 0 |
| 264 | 1 | 1 |
| 265 | 0 | 0 |
| 266 | 1 | 1 |
| 267 | 0 | 0 |
| 268 | 0 | 0 |
| 269 | 1 | 1 |
| 270 | 1 | 1 |
| 271 | 0 | 0 |
| 272 | 0 | 0 |
| 273 | 0 | 0 |
| 274 | 0 | 0 |
| 275 | 0 | 0 |
| 276 | 0 | 0 |
| 277 | 0 | 0 |
| 278 | 0 | 0 |
| 279 | 0 | 0 |
| 280 | 1 | 1 |
| 281 | 1 | 1 |
| 282 | 0 | 0 |
| 283 | 1 | 1 |
| 284 | 1 | 1 |
| 285 | 1 | 1 |
| 286 | 1 | 1 |
| 287 | 1 | 1 |
| 288 | 1 | 1 |
| 289 | 0 | 0 |
| 290 | 1 | 1 |
| 291 | 1 | 1 |
| 292 | 1 | 1 |
| 293 | 1 | 1 |
| 294 | 0 | 0 |
| 295 | 1 | 1 |
| 296 | 0 | 0 |
| 297 | 0 | 0 |
| 298 | 0 | 0 |
| 299 | 0 | 0 |
| 300 | 1 | 1 |
| 301 | 1 | 1 |
| 302 | 0 | 0 |
| 303 | 0 | 0 |
| 304 | 0 | 0 |
| 305 | 0 | 0 |
| 306 | 1 | 1 |
| 307 | 0 | 0 |
| 308 | 0 | 0 |
| 309 | 0 | 0 |
| 310 | 0 | 0 |
| 311 | 1 | 1 |
| 312 | 1 | 1 |
| 313 | 0 | 0 |
| 314 | 0 | 0 |
| 315 | 0 | 0 |
| 316 | 0 | 0 |
| 317 | 0 | 0 |
| 318 | 1 | 1 |
| 319 | 1 | 1 |
| 320 | 0 | 0 |
| 321 | 0 | 0 |
| 322 | 1 | 1 |
| 323 | 1 | 1 |
| 324 | 0 | 0 |
| 325 | 1 | 1 |
| 326 | 1 | 1 |
| 327 | 0 | 0 |
| 328 | 0 | 0 |
| 329 | 0 | 0 |
| 330 | 0 | 0 |
| 331 | 1 | 1 |
| 332 | 0 | 1 |
| 333 | 1 | 1 |
| 334 | 1 | 1 |
| 335 | 0 | 1 |
| 336 | 0 | 0 |
| 337 | 0 | 0 |
| 338 | 0 | 0 |
| 339 | 0 | 0 |
| 340 | 0 | 1 |
| 341 | 0 | 0 |
| 342 | 1 | 1 |
| 343 | 1 | 1 |
| 344 | 0 | 0 |
| 345 | 1 | 1 |
| 346 | 0 | 0 |
| 347 | 1 | 1 |
| 348 | 1 | 1 |
| 349 | 0 | 0 |
| 350 | 0 | 0 |
| 351 | 1 | 1 |
| 352 | 1 | 1 |
| 353 | 0 | 0 |
| 354 | 0 | 0 |
| 355 | 0 | 0 |
| 356 | 0 | 0 |
| 357 | 1 | 1 |
| 358 | 1 | 1 |
| 359 | 0 | 0 |
| 360 | 1 | 1 |
| 361 | 0 | 0 |
| 362 | 1 | 1 |
| 363 | 0 | 0 |
| 364 | 0 | 0 |
| 365 | 1 | 1 |
| 366 | 1 | 1 |
| 367 | 1 | 1 |
| 368 | 1 | 1 |
| 369 | 0 | 0 |
| 370 | 1 | 1 |
| 371 | 0 | 0 |
| 372 | 0 | 0 |
| 373 | 1 | 1 |
| 374 | 1 | 1 |
| 375 | 1 | 1 |
| 376 | 0 | 0 |
| 377 | 0 | 0 |
| 378 | 1 | 1 |
| 379 | 1 | 1 |
| 380 | 0 | 0 |
| 381 | 1 | 1 |
| 382 | 1 | 1 |
| 383 | 1 | 1 |
| 384 | 0 | 0 |
| 385 | 1 | 1 |
| 386 | 0 | 0 |
| 387 | 1 | 1 |
| 388 | 1 | 1 |
| 389 | 1 | 1 |
| 390 | 1 | 1 |
| 391 | 0 | 0 |
| 392 | 1 | 1 |
| 393 | 0 | 0 |
| 394 | 0 | 0 |
| 395 | 0 | 0 |
| 396 | 1 | 1 |
| 397 | 1 | 1 |
| 398 | 0 | 0 |
| 399 | 0 | 0 |
| 400 | 0 | 0 |
| 401 | 0 | 0 |
| 402 | 1 | 1 |
| 403 | 0 | 0 |
| 404 | 0 | 0 |
| 405 | 0 | 0 |
| 406 | 1 | 1 |
| 407 | 1 | 1 |
| 408 | 0 | 0 |
| 409 | 1 | 1 |
| 410 | 0 | 1 |
| 411 | 1 | 1 |
| 412 | 0 | 0 |
| 413 | 1 | 1 |
| 414 | 1 | 1 |
| 415 | 0 | 0 |
| 416 | 1 | 1 |
| 417 | 1 | 1 |
| 418 | 0 | 0 |
| 419 | 1 | 1 |
| 420 | 1 | 1 |
| 421 | 0 | 0 |
| 422 | 0 | 0 |
| 423 | 0 | 0 |
| 424 | 1 | 1 |
| 425 | 0 | 0 |
| 426 | 1 | 1 |
| 427 | 1 | 1 |
| 428 | 1 | 1 |
| 429 | 0 | 0 |
| 430 | 1 | 1 |
| 431 | 1 | 1 |
| 432 | 0 | 0 |
| 433 | 0 | 0 |
| 434 | 0 | 0 |
| 435 | 0 | 0 |
| 436 | 0 | 0 |
| 437 | 1 | 1 |
| 438 | 0 | 0 |
| 439 | 0 | 0 |
| 440 | 0 | 0 |
| 441 | 0 | 0 |
| 442 | 1 | 1 |
| 443 | 1 | 1 |
| 444 | 0 | 0 |
| 445 | 1 | 1 |
| 446 | 1 | 1 |
| 447 | 0 | 1 |
| 448 | 0 | 1 |
| 449 | 0 | 0 |
| 450 | 0 | 0 |
| 451 | 1 | 1 |
| 452 | 0 | 0 |
| 453 | 1 | 1 |
| 454 | 0 | 0 |
| 455 | 1 | 1 |
| 456 | 1 | 1 |
| 457 | 0 | 0 |
| 458 | 1 | 1 |
| 459 | 0 | 0 |
| 460 | 1 | 1 |
| 461 | 0 | 0 |
| 462 | 0 | 0 |
| 463 | 1 | 1 |
| 464 | 1 | 1 |
| 465 | 0 | 0 |
| 466 | 1 | 1 |
| 467 | 0 | 0 |
| 468 | 1 | 1 |
| 469 | 1 | 1 |
| 470 | 1 | 1 |
| 471 | 0 | 0 |
| 472 | 1 | 1 |
| 473 | 0 | 0 |
| 474 | 1 | 1 |
| 475 | 1 | 1 |
| 476 | 1 | 1 |
| 477 | 0 | 0 |
| 478 | 0 | 0 |
| 479 | 0 | 0 |
| 480 | 0 | 0 |
| 481 | 1 | 1 |
| 482 | 0 | 0 |
| 483 | 1 | 1 |
| 484 | 0 | 0 |
| 485 | 1 | 1 |
| 486 | 0 | 0 |
| 487 | 1 | 1 |
| 488 | 1 | 1 |
| 489 | 1 | 1 |
| 490 | 0 | 0 |
| 491 | 1 | 1 |
| 492 | 1 | 1 |
| 493 | 1 | 1 |
| 494 | 0 | 0 |
| 495 | 0 | 0 |
| 496 | 0 | 0 |
| 497 | 1 | 1 |
| 498 | 0 | 0 |
| 499 | 0 | 0 |
| 500 | 0 | 0 |
| 501 | 1 | 1 |
| 502 | 1 | 1 |
| 503 | 1 | 1 |
| 504 | 1 | 1 |
| 505 | 1 | 1 |
| 506 | 0 | 0 |
| 507 | 1 | 1 |
| 508 | 0 | 0 |
| 509 | 0 | 0 |
| 510 | 1 | 1 |
| 511 | 1 | 1 |
| 512 | 1 | 1 |
| 513 | 0 | 0 |
| 514 | 0 | 0 |
| 515 | 0 | 0 |
| 516 | 0 | 0 |
| 517 | 0 | 0 |
| 518 | 0 | 0 |
| 519 | 0 | 0 |
| 520 | 0 | 0 |
| 521 | 0 | 0 |
| 522 | 1 | 1 |
| 523 | 0 | 0 |
| 524 | 1 | 1 |
| 525 | 1 | 1 |
| 526 | 1 | 1 |
| 527 | 0 | 0 |
| 528 | 1 | 1 |
| 529 | 1 | 1 |
| 530 | 1 | 1 |
| 531 | 0 | 0 |
| 532 | 0 | 0 |
| 533 | 1 | 1 |
| 534 | 0 | 0 |
| 535 | 1 | 1 |
| 536 | 0 | 0 |
| 537 | 0 | 0 |
| 538 | 1 | 1 |
| 539 | 0 | 0 |
| 540 | 1 | 1 |
| 541 | 0 | 0 |
| 542 | 1 | 1 |
| 543 | 1 | 1 |
| 544 | 0 | 0 |
| 545 | 0 | 0 |
| 546 | 1 | 1 |
| 547 | 0 | 0 |
| 548 | 0 | 0 |
| 549 | 0 | 0 |
| 550 | 0 | 0 |
| 551 | 1 | 1 |
| 552 | 0 | 0 |
| 553 | 1 | 1 |
| 554 | 1 | 1 |
| 555 | 0 | 0 |
| 556 | 1 | 1 |
| 557 | 1 | 1 |
| 558 | 1 | 1 |
| 559 | 1 | 1 |
| 560 | 1 | 1 |
| 561 | 1 | 1 |
| 562 | 0 | 0 |
| 563 | 0 | 0 |
| 564 | 0 | 0 |
| 565 | 0 | 1 |
| 566 | 0 | 0 |
| 567 | 1 | 1 |
| 568 | 0 | 0 |
| 569 | 0 | 0 |
| 570 | 0 | 1 |
| 571 | 0 | 1 |
| 572 | 1 | 1 |
| 573 | 1 | 1 |
| 574 | 0 | 0 |
| 575 | 0 | 0 |
| 576 | 1 | 1 |
| 577 | 0 | 0 |
| 578 | 0 | 0 |
| 579 | 1 | 1 |
| 580 | 1 | 1 |
| 581 | 0 | 0 |
| 582 | 1 | 1 |
| 583 | 0 | 0 |
| 584 | 1 | 1 |
| 585 | 1 | 1 |
| 586 | 1 | 1 |
| 587 | 1 | 1 |
| 588 | 1 | 1 |
| 589 | 1 | 1 |
| 590 | 0 | 0 |
| 591 | 1 | 1 |
| 592 | 0 | 0 |
| 593 | 1 | 1 |
| 594 | 0 | 0 |
| 595 | 1 | 1 |
| 596 | 1 | 1 |
| 597 | 0 | 0 |
| 598 | 1 | 1 |
| 599 | 0 | 1 |
| 600 | 1 | 1 |
| 601 | 0 | 0 |
| 602 | 0 | 0 |
| 603 | 1 | 1 |
| 604 | 0 | 0 |
| 605 | 1 | 1 |
| 606 | 0 | 0 |
| 607 | 0 | 0 |
| 608 | 1 | 1 |
| 609 | 0 | 0 |
| 610 | 1 | 1 |
| 611 | 0 | 0 |
| 612 | 0 | 0 |
| 613 | 1 | 1 |
| 614 | 0 | 0 |
| 615 | 1 | 1 |
| 616 | 0 | 0 |
| 617 | 0 | 0 |
| 618 | 0 | 1 |
| 619 | 1 | 1 |
| 620 | 1 | 1 |
| 621 | 1 | 1 |
| 622 | 1 | 1 |
| 623 | 0 | 1 |
| 624 | 1 | 1 |
| 625 | 1 | 1 |
| 626 | 0 | 0 |
| 627 | 0 | 0 |
| 628 | 1 | 1 |
| 629 | 0 | 0 |
| 630 | 1 | 1 |
| 631 | 0 | 0 |
| 632 | 0 | 0 |
| 633 | 1 | 1 |
| 634 | 1 | 1 |
| 635 | 0 | 0 |
| 636 | 0 | 0 |
| 637 | 1 | 1 |
| 638 | 0 | 0 |
| 639 | 1 | 1 |
| 640 | 0 | 0 |
| 641 | 0 | 0 |
| 642 | 1 | 1 |
| 643 | 0 | 0 |
| 644 | 0 | 0 |
| 645 | 1 | 1 |
| 646 | 1 | 1 |
| 647 | 0 | 0 |
| 648 | 1 | 1 |
| 649 | 1 | 1 |
| 650 | 0 | 0 |
| 651 | 0 | 0 |
| 652 | 0 | 1 |
| 653 | 1 | 1 |
| 654 | 1 | 1 |
| 655 | 0 | 0 |
| 656 | 0 | 0 |
| 657 | 0 | 0 |
| 658 | 0 | 0 |
| 659 | 1 | 1 |
| 660 | 0 | 0 |
| 661 | 0 | 0 |
| 662 | 0 | 0 |
| 663 | 0 | 0 |
| 664 | 1 | 1 |
| 665 | 1 | 1 |
| 666 | 1 | 1 |
| 667 | 1 | 1 |
| 668 | 0 | 0 |
| 669 | 1 | 1 |
| 670 | 1 | 1 |
| 671 | 1 | 1 |
| 672 | 1 | 1 |
| 673 | 0 | 0 |
| 674 | 0 | 0 |
| 675 | 1 | 1 |
| 676 | 0 | 1 |
| 677 | 0 | 1 |
| 678 | 1 | 1 |
| 679 | 0 | 1 |
| 680 | 1 | 1 |
| 681 | 1 | 1 |
| 682 | 1 | 1 |
| 683 | 1 | 1 |
| 684 | 1 | 1 |
| 685 | 0 | 0 |
| 686 | 0 | 0 |
| 687 | 0 | 0 |
| 688 | 1 | 1 |
| 689 | 0 | 0 |
| 690 | 1 | 1 |
| 691 | 0 | 0 |
| 692 | 1 | 1 |
| 693 | 0 | 0 |
| 694 | 1 | 1 |
| 695 | 1 | 1 |
| 696 | 0 | 0 |
| 697 | 1 | 1 |
| 698 | 0 | 0 |
| 699 | 1 | 1 |
| 700 | 0 | 0 |
| 701 | 1 | 1 |
| 702 | 1 | 1 |
| 703 | 0 | 0 |
| 704 | 1 | 1 |
| 705 | 1 | 1 |
| 706 | 1 | 1 |
| 707 | 0 | 0 |
| 708 | 1 | 1 |
| 709 | 1 | 1 |
| 710 | 1 | 1 |
| 711 | 0 | 0 |
| 712 | 1 | 1 |
| 713 | 1 | 1 |
| 714 | 0 | 0 |
| 715 | 0 | 0 |
| 716 | 0 | 0 |
| 717 | 1 | 1 |
| 718 | 1 | 1 |
| 719 | 1 | 1 |
| 720 | 1 | 1 |
| 721 | 0 | 0 |
| 722 | 1 | 1 |
| 723 | 0 | 0 |
| 724 | 0 | 0 |
| 725 | 0 | 0 |
| 726 | 1 | 1 |
| 727 | 0 | 0 |
| 728 | 1 | 1 |
| 729 | 1 | 1 |
| 730 | 0 | 0 |
| 731 | 1 | 1 |
| 732 | 1 | 1 |
| 733 | 1 | 1 |
| 734 | 1 | 1 |
| 735 | 0 | 0 |
| 736 | 1 | 1 |
| 737 | 1 | 1 |
| 738 | 1 | 1 |
| 739 | 0 | 1 |
| 740 | 1 | 1 |
| 741 | 1 | 1 |
| 742 | 1 | 1 |
| 743 | 0 | 0 |
| 744 | 0 | 0 |
| 745 | 0 | 0 |
| 746 | 1 | 1 |
| 747 | 0 | 0 |
| 748 | 0 | 0 |
| 749 | 1 | 1 |
| 750 | 0 | 0 |
| 751 | 1 | 1 |
| 752 | 0 | 0 |
| 753 | 1 | 1 |
| 754 | 0 | 0 |
| 755 | 1 | 1 |
| 756 | 0 | 0 |
| 757 | 0 | 0 |
| 758 | 0 | 0 |
| 759 | 1 | 1 |
| 760 | 0 | 0 |
| 761 | 0 | 1 |
| 762 | 1 | 1 |
| 763 | 0 | 0 |
| 764 | 0 | 0 |
| 765 | 1 | 1 |
| 766 | 1 | 1 |
| 767 | 0 | 0 |
| 768 | 1 | 1 |
| 769 | 1 | 1 |
| 770 | 0 | 0 |
| 771 | 1 | 1 |
| 772 | 1 | 1 |
| 773 | 0 | 0 |
| 774 | 0 | 0 |
| 775 | 1 | 1 |
| 776 | 1 | 1 |
| 777 | 0 | 0 |
| 778 | 1 | 1 |
| 779 | 1 | 1 |
| 780 | 1 | 1 |
| 781 | 0 | 0 |
| 782 | 0 | 0 |
| 783 | 0 | 0 |
| 784 | 0 | 0 |
| 785 | 1 | 1 |
| 786 | 0 | 0 |
| 787 | 1 | 1 |
| 788 | 1 | 1 |
| 789 | 1 | 1 |
| 790 | 0 | 0 |
| 791 | 0 | 0 |
| 792 | 0 | 0 |
| 793 | 0 | 0 |
| 794 | 1 | 1 |
| 795 | 0 | 0 |
| 796 | 0 | 0 |
| 797 | 0 | 0 |
| 798 | 1 | 1 |
| 799 | 0 | 0 |
| 800 | 1 | 1 |
| 801 | 1 | 1 |
| 802 | 0 | 0 |
| 803 | 1 | 1 |
| 804 | 0 | 0 |
| 805 | 1 | 0 |
| 806 | 0 | 0 |
| 807 | 0 | 0 |
| 808 | 1 | 1 |
| 809 | 1 | 1 |
| 810 | 1 | 1 |
| 811 | 1 | 1 |
| 812 | 1 | 1 |
| 813 | 1 | 1 |
| 814 | 0 | 0 |
| 815 | 0 | 0 |
| 816 | 0 | 0 |
| 817 | 0 | 0 |
| 818 | 0 | 0 |
| 819 | 0 | 0 |
| 820 | 0 | 1 |
| 821 | 0 | 0 |
| 822 | 0 | 0 |
| 823 | 1 | 1 |
| 824 | 1 | 1 |
| 825 | 1 | 1 |
| 826 | 1 | 1 |
| 827 | 1 | 1 |
| 828 | 1 | 1 |
| 829 | 0 | 0 |
| 830 | 1 | 1 |
| 831 | 1 | 1 |
| 832 | 1 | 1 |
| 833 | 0 | 0 |
| 834 | 1 | 1 |
| 835 | 1 | 1 |
| 836 | 0 | 0 |
| 837 | 0 | 0 |
| 838 | 1 | 1 |
| 839 | 0 | 0 |
| 840 | 1 | 1 |
| 841 | 1 | 1 |
| 842 | 1 | 1 |
| 843 | 0 | 0 |
| 844 | 0 | 0 |
| 845 | 0 | 0 |
| 846 | 0 | 0 |
| 847 | 0 | 1 |
| 848 | 0 | 0 |
| 849 | 0 | 1 |
| 850 | 1 | 1 |
| 851 | 0 | 0 |
| 852 | 0 | 0 |
| 853 | 0 | 0 |
| 854 | 0 | 0 |
| 855 | 0 | 0 |
| 856 | 1 | 1 |
| 857 | 0 | 1 |
| 858 | 1 | 1 |
| 859 | 1 | 1 |
| 860 | 0 | 0 |
| 861 | 0 | 0 |
| 862 | 0 | 0 |
| 863 | 0 | 0 |
| 864 | 0 | 0 |
| 865 | 1 | 1 |
| 866 | 0 | 0 |
| 867 | 1 | 1 |
| 868 | 0 | 0 |
| 869 | 1 | 1 |
| 870 | 1 | 1 |
| 871 | 0 | 0 |
| 872 | 0 | 0 |
| 873 | 0 | 0 |
| 874 | 0 | 0 |
| 875 | 1 | 1 |
| 876 | 0 | 0 |
| 877 | 0 | 1 |
| 878 | 0 | 0 |
| 879 | 0 | 0 |
| 880 | 1 | 1 |
| 881 | 1 | 1 |
| 882 | 1 | 1 |
| 883 | 1 | 1 |
| 884 | 0 | 0 |
| 885 | 0 | 0 |
| 886 | 1 | 1 |
| 887 | 0 | 0 |
| 888 | 1 | 1 |
| 889 | 0 | 0 |
| 890 | 0 | 0 |
| 891 | 0 | 0 |
| 892 | 1 | 1 |
| 893 | 1 | 1 |
| 894 | 0 | 0 |
| 895 | 0 | 0 |
| 896 | 0 | 0 |
| 897 | 0 | 0 |
| 898 | 1 | 1 |
| 899 | 0 | 0 |
| 900 | 1 | 1 |
| 901 | 1 | 1 |
| 902 | 0 | 0 |
| 903 | 1 | 1 |
| 904 | 1 | 1 |
| 905 | 1 | 1 |
| 906 | 1 | 1 |
| 907 | 0 | 0 |
| 908 | 0 | 0 |
| 909 | 1 | 1 |
| 910 | 0 | 0 |
| 911 | 0 | 0 |
| 912 | 1 | 1 |
| 913 | 0 | 0 |
| 914 | 0 | 0 |
| 915 | 0 | 0 |
| 916 | 0 | 1 |
| 917 | 1 | 1 |
| 918 | 0 | 1 |
| 919 | 1 | 1 |
| 920 | 1 | 1 |
| 921 | 0 | 0 |
| 922 | 0 | 0 |
| 923 | 0 | 0 |
| 924 | 0 | 0 |
| 925 | 0 | 0 |
| 926 | 0 | 0 |
| 927 | 0 | 0 |
| 928 | 1 | 1 |
| 929 | 0 | 0 |
| 930 | 0 | 0 |
| 931 | 0 | 0 |
| 932 | 1 | 1 |
| 933 | 1 | 1 |
| 934 | 0 | 0 |
| 935 | 1 | 1 |
| 936 | 1 | 1 |
| 937 | 1 | 1 |
| 938 | 1 | 1 |
| 939 | 1 | 1 |
| 940 | 0 | 0 |
| 941 | 0 | 0 |
| 942 | 0 | 0 |
| 943 | 0 | 0 |
| 944 | 0 | 0 |
| 945 | 0 | 0 |
| 946 | 1 | 1 |
| 947 | 0 | 0 |
| 948 | 1 | 1 |
| 949 | 0 | 0 |
| 950 | 1 | 1 |
| 951 | 0 | 0 |
| 952 | 0 | 0 |
| 953 | 1 | 1 |
| 954 | 0 | 0 |
| 955 | 1 | 1 |
| 956 | 0 | 0 |
| 957 | 1 | 1 |
| 958 | 0 | 0 |
| 959 | 0 | 0 |
| 960 | 0 | 0 |
| 961 | 0 | 0 |
| 962 | 0 | 0 |
| 963 | 0 | 0 |
| 964 | 0 | 0 |
| 965 | 1 | 1 |
| 966 | 0 | 0 |
| 967 | 0 | 0 |
| 968 | 1 | 1 |
| 969 | 1 | 1 |
| 970 | 1 | 1 |
| 971 | 0 | 0 |
| 972 | 1 | 1 |
| 973 | 0 | 0 |
| 974 | 0 | 0 |
| 975 | 1 | 0 |
| 976 | 1 | 1 |
| 977 | 1 | 1 |
| 978 | 0 | 0 |
| 979 | 0 | 0 |
| 980 | 1 | 1 |
| 981 | 1 | 1 |
| 982 | 0 | 0 |
| 983 | 1 | 1 |
| 984 | 0 | 0 |
| 985 | 1 | 1 |
| 986 | 0 | 0 |
| 987 | 1 | 1 |
| 988 | 1 | 1 |
| 989 | 0 | 0 |
| 990 | 1 | 1 |
| 991 | 0 | 0 |
| 992 | 0 | 0 |
| 993 | 0 | 0 |
| 994 | 1 | 1 |
| 995 | 1 | 1 |
| 996 | 1 | 1 |
| 997 | 0 | 0 |
| 998 | 0 | 0 |
| 999 | 0 | 0 |
Accuracy Score: 0.9649
Comparison of the actual vs. predicted diabetes outcomes for 1000 data points. The model correctly predicted most of the cases, as indicated by the matching values. The overall accuracy score of the model is 96.49%, reflecting its high proficiency in making accurate predictions.
#!pip install shap
import shap
# Enable SHAP's interactive JS widgets (force plots) in the notebook.
shap.initjs()
# TreeExplainer computes exact SHAP values for tree ensembles like this forest.
explainer = shap.TreeExplainer(best_rf)
# Explain only the first 100 test rows — SHAP on the full set is slow.
# NOTE(review): indexing shap_values[1] below assumes the older shap API that
# returns a per-class list [class0_values, class1_values] for binary
# classifiers; newer shap versions return a single array — confirm the
# installed version before relying on these plots.
shap_values = explainer.shap_values(X_test.iloc[:100])
# Global view: feature importance and direction of effect over the sample.
shap.summary_plot(shap_values, X_test.iloc[:100])
# Local view: per-feature contributions for the first test instance
# (class-1 / "Diabetes" expected value and SHAP values).
shap.force_plot(explainer.expected_value[1], shap_values[1][0], X_test.iloc[0])
# Stacked force plot across the first 50 instances.
shap.force_plot(explainer.expected_value[1], shap_values[1][:50], X_test.iloc[:50])
# How the class-1 SHAP value varies with the 'Age' feature.
shap.dependence_plot("Age", shap_values[1], X_test.iloc[:100])
Interpretation of the SHAP visualizations:
The summary plot provides an overview of the feature importances and their impact on the model's predictions across the explained sample.
The force plot for a single instance shows how each feature pushes that one prediction above or below the model's expected (baseline) value.
The stacked force plot extends this view to multiple instances, making it possible to compare the feature contributions across many predictions at once.
The dependence plot shows how the model output varies with the value of a single feature (here, Age), often revealing more intricate patterns and interactions.